Index: lib/Target/PowerPC/PPCFrameLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCFrameLowering.cpp
+++ lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -28,6 +28,28 @@
 
 using namespace llvm;
 
+// Relax the position of the stack pointer update (SPUpdate) instruction to
+// give the scheduler more freedom to reorder prologue and epilogue code.
+//        Before                After
+//
+// prologue:
+//        mflr 0                stdu 1, -240(1)
+//        mfcr 12               ..
+//        std 0, 16(1)          (scheduler gets more reorder chances)
+//        stw 12, 8(1)
+//        stdu 1, -240(1)
+//
+// epilogue:
+//        addi 1, 1, 240        (scheduler gets more reorder chances)
+//        ld 0, 16(1)
+//        lwz 12, 8(1)
+//        mtocrf 32, 12
+//        mtlr 0                addi 1, 1, 240
+//        blr                   blr
+static cl::opt<bool> RelaxPrologEpilog(
+  "ppc-relax-prologepilog-inst-order", cl::Hidden, cl::init(true),
+  cl::desc("Create instruction reorder chances for post-RA-scheduler"));
+
 /// VRRegNo - Map from a numbered VR register to its enum value.
 ///
 static const MCPhysReg VRRegNo[] = {
@@ -692,6 +714,61 @@
   return findScratchRegister(TmpMBB, true);
 }
 
+/// Attach a fixed-stack memory operand to \p MI, a load or store of a register
+/// save slot. E.g. for MI = "std r0, 16(r1)" this adds "mem:ST8[FixedStack-1]"
+/// to mark 16(r1) as a stack object.
+///
+/// Without it, ScheduleDAGInstrs::buildSchedGraph treats MI as an access to
+/// global memory and will not reorder it with other loads and stores.
+///
+/// A fixed object whose offset equals \p Offset is reused; otherwise a new
+/// pointer-sized one is created there.
+/// NOTE(review): the new object is created Immutable although the prologue
+/// stores to it, and pointer-sized even for the 4-byte CR save - confirm.
+static void addMemOperandAsStackObj(MachineInstr *MI, int Offset,
+    MachineFunction &MF, bool IsPPC64, bool IsStore) {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
+
+  // Fixed objects use the frame indices [getObjectIndexBegin(), 0); look for
+  // one that already sits at Offset.
+  int FixedObjIdx = MFI->getObjectIndexBegin();
+  const int EndFixedObjIdx = 0;
+  for (; FixedObjIdx != EndFixedObjIdx; ++FixedObjIdx)
+    if (Offset == MFI->getObjectOffset(FixedObjIdx))
+      break;
+
+  // No match: register a new fixed object for this slot.
+  int FrameIdx = FixedObjIdx;
+  if (FixedObjIdx == EndFixedObjIdx)
+    FrameIdx = MFI->CreateFixedObject(PtrByteSize, Offset, true, false);
+
+  // Describe MI's access as a load from / store to that stack object.
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FrameIdx),
+      IsStore ? MachineMemOperand::MOStore : MachineMemOperand::MOLoad,
+      MFI->getObjectSize(FrameIdx), MFI->getObjectAlignment(FrameIdx));
+  MI->addMemOperand(MF, MMO);
+}
+
+/// Return true when the SP update may be placed first in the prologue and last
+/// in the epilogue: PPC64 SVR4 (ELFv1 or ELFv2) with a non-empty small frame,
+/// no base pointer and no PIC base, and with every save-slot offset still a
+/// 16-bit immediate after FrameSize is added.
+/// NOTE(review): emitEpilogue's single-scratch-register CR reload keeps the
+/// displacement 8 without adding FrameSize - confirm it cannot be reached here.
+static bool isEligibleForRelaxingPrologEpilog(bool IsPPC64, bool IsSVR4ABI,
+    bool IsLargeFrame, bool HasBP, bool UsePICBase,
+    unsigned FrameSize, int FPOffset, int LROffset, int CROffset) {
+  // FrameSize == 0 leaves no SP update to move, so reject it explicitly. Sum
+  // in 64 bits: FPOffset may be negative and must not wrap around as unsigned.
+  return RelaxPrologEpilog && IsPPC64 && IsSVR4ABI && FrameSize != 0 &&
+    !IsLargeFrame && !HasBP && !UsePICBase &&
+    isInt<16>(static_cast<int64_t>(FPOffset) + FrameSize) &&
+    isInt<16>(static_cast<int64_t>(LROffset) + FrameSize) &&
+    isInt<16>(static_cast<int64_t>(CROffset) + FrameSize);
+}
+
 void PPCFrameLowering::emitPrologue(MachineFunction &MF,
                                     MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -865,40 +942,49 @@
       MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
   }
 
+  int AdjustOffset = 0;
+  bool PlaceSPUpdateToTheBegin = false;
+  if (isEligibleForRelaxingPrologEpilog(isPPC64, isSVR4ABI,
+      isLargeFrame, HasBP, FI->usesPICBase(),
+      FrameSize, FPOffset, LROffset, 8)) {
+    AdjustOffset = FrameSize;
+    PlaceSPUpdateToTheBegin = true;
+  }
+
+  auto SaveRegToStack = [&](unsigned Reg, int Offset) {
+    MachineInstr *MI = BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(Reg)
+      .addImm(Offset + AdjustOffset)
+      .addReg(SPReg);
+    addMemOperandAsStackObj(MI, Offset, MF, isPPC64, true /*IsStore*/);
+  };
+
   if (HasFP)
     // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
-    BuildMI(MBB, MBBI, dl, StoreInst)
-      .addReg(FPReg)
-      .addImm(FPOffset)
-      .addReg(SPReg);
+    SaveRegToStack(FPReg, FPOffset);
 
   if (FI->usesPICBase())
     // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
-    BuildMI(MBB, MBBI, dl, StoreInst)
-      .addReg(PPC::R30)
-      .addImm(PBPOffset)
-      .addReg(SPReg);
+    SaveRegToStack(PPC::R30, PBPOffset);
 
   if (HasBP)
     // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
-    BuildMI(MBB, MBBI, dl, StoreInst)
-      .addReg(BPReg)
-      .addImm(BPOffset)
-      .addReg(SPReg);
+    SaveRegToStack(BPReg, BPOffset);
 
   if (MustSaveLR)
     // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
- BuildMI(MBB, MBBI, dl, StoreInst) - .addReg(ScratchReg) - .addImm(LROffset) - .addReg(SPReg); + SaveRegToStack(ScratchReg, LROffset); if (MustSaveCR && !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64 - BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8)) - .addReg(TempReg, getKillRegState(true)) - .addImm(8) - .addReg(SPReg); + { + MachineInstr* MI = + BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8)) + .addReg(TempReg, getKillRegState(true)) + .addImm(8 + AdjustOffset) + .addReg(SPReg); + addMemOperandAsStackObj(MI, 8, MF, isPPC64, true /*IsStore*/); + } // Skip the rest if this is a leaf function & all spills fit in the Red Zone. if (!FrameSize) return; @@ -947,7 +1033,10 @@ .addReg(ScratchReg); } else if (!isLargeFrame) { - BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg) + MachineBasicBlock::iterator InsrtPos = MBBI; + if (PlaceSPUpdateToTheBegin) + InsrtPos = MBB.begin(); + BuildMI(MBB, InsrtPos, dl, StoreUpdtInst, SPReg) .addReg(SPReg) .addImm(NegFrameSize) .addReg(SPReg); @@ -1205,6 +1294,15 @@ // indexed into with a simple LD/LWZ immediate offset operand. bool isLargeFrame = !isInt<16>(FrameSize); + int AdjustOffset = 0; + bool PlaceSPUpdateToTheEnd = false; + if (isEligibleForRelaxingPrologEpilog(isPPC64, isSVR4ABI, + isLargeFrame, HasBP, FI->usesPICBase(), + FrameSize, FPOffset, LROffset, 8)) { + AdjustOffset = FrameSize; + PlaceSPUpdateToTheEnd = true; + } + if (FrameSize) { // In the prologue, the loaded (or persistent) stack pointer value is offset // by the STDU/STDUX/STWU/STWUX instruction. Add this offset back now. @@ -1238,8 +1336,36 @@ .addImm(0) .addReg(SPReg); } + + // Change insert point to Stack Pointer Update Instruction. So later stack + // object restore instructions are inserted before SPUpdate instr. + if (PlaceSPUpdateToTheEnd) + MBBI = --MBBI; } + // We need a stack pointer that keeps original sp value, so we can restore + // LR, CR, FP with correct offset. + // e.g. 
SPEC2006/403.gcc/real.c: asctoeg() + // Prologue: + // stdu r1,-432(r1) + // std r0,448(r1) + // mr r31,r1 <= FPReg contains original sp value + // Body: + // stdux r4,r1,r3 <= Because of Varied Sized Object, the SPReg + // is changed + // Epilogue: + // ld r0,448(r31) <= FPReg contains correct sp value, use it to + // restore LR, CR, FP. + // ld r1,0(r1) + // + // So when we have fp, just use it. + unsigned CorrectSPReg = SPReg; + if (PlaceSPUpdateToTheEnd && HasFP) + CorrectSPReg = FPReg; + + if (PlaceSPUpdateToTheEnd && MFI->hasVarSizedObjects()) + assert(HasFP && "Expecting a valid frame pointer."); + assert((isPPC64 || !MustSaveCR) && "Epilogue CR restoring supported only in 64-bit mode"); @@ -1248,39 +1374,39 @@ if (MustSaveCR && SingleScratchReg && MustSaveLR) { BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg) .addImm(8) - .addReg(SPReg); + .addReg(CorrectSPReg); for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i]) .addReg(TempReg, getKillRegState(i == e-1)); } + auto RestoreRegFromStack = [&](const MCInstrDesc& LoadInst, + unsigned Reg, int Offset) { + MachineInstr *MI = BuildMI(MBB, MBBI, dl, LoadInst, Reg) + .addImm(Offset + AdjustOffset) + .addReg(CorrectSPReg); + addMemOperandAsStackObj(MI, Offset, MF, isPPC64, false /*IsStore*/); + }; + if (MustSaveLR) - BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg) - .addImm(LROffset) - .addReg(SPReg); + RestoreRegFromStack(LoadInst, ScratchReg, LROffset); if (MustSaveCR && !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64 - BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg) - .addImm(8) - .addReg(SPReg); + RestoreRegFromStack(TII.get(PPC::LWZ8), TempReg, 8); if (HasFP) - BuildMI(MBB, MBBI, dl, LoadInst, FPReg) - .addImm(FPOffset) - .addReg(SPReg); + RestoreRegFromStack(LoadInst, FPReg, FPOffset); if (FI->usesPICBase()) // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe. 
BuildMI(MBB, MBBI, dl, LoadInst) .addReg(PPC::R30) .addImm(PBPOffset) - .addReg(SPReg); + .addReg(CorrectSPReg); if (HasBP) - BuildMI(MBB, MBBI, dl, LoadInst, BPReg) - .addImm(BPOffset) - .addReg(SPReg); + RestoreRegFromStack(LoadInst, BPReg, BPOffset); if (MustSaveCR && !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64 @@ -1294,6 +1420,11 @@ // Callee pop calling convention. Pop parameter/linkage area. Used for tail // call optimization if (IsReturnBlock) { + // The current MBBI is SPUpdate instr, change it back to first terminator of + // MBB. + if (PlaceSPUpdateToTheEnd) + MBBI = ++MBBI; + unsigned RetOpcode = MBBI->getOpcode(); if (MF.getTarget().Options.GuaranteedTailCallOpt && (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) && Index: lib/Target/PowerPC/PPCInstr64Bit.td =================================================================== --- lib/Target/PowerPC/PPCInstr64Bit.td +++ lib/Target/PowerPC/PPCInstr64Bit.td @@ -385,12 +385,13 @@ def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>; -let Defs = [LR8] in { +// Add no side effects attribute so post-RA-scheduler can reorder it. +let Defs = [LR8], hasSideEffects = 0 in { def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS), "mtlr $rS", IIC_SprMTSPR>, PPC970_DGroup_First, PPC970_Unit_FXU; } -let Uses = [LR8] in { +let Uses = [LR8], hasSideEffects = 0 in { def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins), "mflr $rT", IIC_SprMFSPR>, PPC970_DGroup_First, PPC970_Unit_FXU; Index: lib/Target/PowerPC/PPCScheduleP8.td =================================================================== --- lib/Target/PowerPC/PPCScheduleP8.td +++ lib/Target/PowerPC/PPCScheduleP8.td @@ -310,15 +310,28 @@ InstrStage<1, [P8_LSU1, P8_LSU2], 0>, InstrStage<1, [P8_LU1, P8_LU2]>], [1, 1, 1]>, + // Use 2 cycles for [P8_CRU] stage, as IIC_SprMTSPR, IIC_SprMFSPR does. + // See comments in IIC_SprMTSPR. 
InstrItinData, - InstrStage<1, [P8_CRU]>], + InstrStage<2, [P8_CRU]>], [6, 1]>, InstrItinData, InstrStage<1, [P8_CRU]>], [3, 1]>, + // Change execution pipe from [P8_FXU1, P8_FXU2] to [P8_CRU], we tell + // post-RA-scheduler that 'mflr' and 'mfcr' use the same resource, and both + // occupy execution pipe 2 cycles, so post-RA-sched will try to separate them. + // + // The reason comes from: P8_um_external_v12_17JUN2015_pub.pdf p.221: + // "If two mfsprs or mtsprs are issued at the same time on any FXU or CRU" + // "issue port, one must be cancelled" InstrItinData, - InstrStage<1, [P8_FXU1, P8_FXU2]>], + InstrStage<2, [P8_CRU]>], [4, 1]>, // mtctr + // For modeling mflr/mfctr + InstrItinData, + InstrStage<2, [P8_CRU]>], + [6, 1]>, InstrItinData, InstrStage<1, [P8_FPU1, P8_FPU2]>], Index: test/CodeGen/PowerPC/crsave.ll =================================================================== --- test/CodeGen/PowerPC/crsave.ll +++ test/CodeGen/PowerPC/crsave.ll @@ -1,5 +1,6 @@ ; RUN: llc -O0 -disable-fp-elim -mtriple=powerpc-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC32 -; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC64 +; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5 -ppc-relax-prologepilog-inst-order=false < %s | FileCheck %s -check-prefix=PPC64 +; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC64-OPT-PE declare void @foo() @@ -32,6 +33,14 @@ ; PPC64: mtocrf 32, 12 ; PPC64: .cfi_endproc +; PPC64-OPT-PE: .cfi_startproc +; PPC64-OPT-PE: stdu 1, -128(1) +; PPC64-OPT-PE: mfcr 12 +; PPC64-OPT-PE: stw 12, 136(1) +; PPC64-OPT-PE: .cfi_def_cfa_offset 128 +; PPC64-OPT-PE: .cfi_offset lr, 16 +; PPC64-OPT-PE: .cfi_offset cr2, 8 + define i32 @test_cr234() nounwind { entry: %ret = alloca i32, align 4 @@ -60,3 +69,6 @@ ; PPC64: mtocrf 16, 12 ; PPC64: mtocrf 8, 12 +; PPC64-OPT-PE: stdu 1, -128(1) +; PPC64-OPT-PE: mfcr 12 +; PPC64-OPT-PE: stw 12, 136(1) 
Index: test/CodeGen/PowerPC/ppc-shrink-wrapping.ll =================================================================== --- test/CodeGen/PowerPC/ppc-shrink-wrapping.ll +++ test/CodeGen/PowerPC/ppc-shrink-wrapping.ll @@ -39,14 +39,16 @@ ; Restore the link register and return. ; Note that there could be other epilog code before the link register is ; restored but we will not check for it here. -; ENABLE: mtlr +; ENABLE-DAG: mtlr +; ENABLE-DAG: addi 1, 1, {{[0-9]+}} ; ENABLE-NEXT: blr ; ; DISABLE: [[EXIT_LABEL]]: ; ; Without shrink-wrapping, epilogue is in the exit block. ; Epilogue code. (What we pop does not matter.) -; DISABLE: mtlr {{[0-9]+}} +; DISABLE-DAG: mtlr {{[0-9]+}} +; DISABLE-DAG: addi 1, 1, {{[0-9]+}} ; DISABLE-NEXT: blr ; @@ -109,7 +111,8 @@ ; DISABLE: .[[EPILOG_BB]]: # %if.end ; ; Epilogue code. -; CHECK: mtlr {{[0-9]+}} +; CHECK-DAG: mtlr {{[0-9]+}} +; CHECK-DAG: addi 1, 1, {{[0-9]+}} ; CHECK-NEXT: blr ; ; ENABLE: .[[ELSE_LABEL]]: # %if.else @@ -170,7 +173,8 @@ ; ; Next BB ; CHECK: %for.exit -; CHECK: mtlr {{[0-9]+}} +; CHECK-DAG: mtlr {{[0-9]+}} +; CHECK-DAG: addi 1, 1, {{[0-9]+}} ; CHECK-NEXT: blr define i32 @freqSaveAndRestoreOutsideLoop2(i32 %cond) { entry: @@ -237,7 +241,8 @@ ; ; DISABLE: .[[EPILOG_BB]]: # %if.end ; Epilog code -; CHECK: mtlr {{[0-9]+}} +; CHECK-DAG: mtlr {{[0-9]+}} +; CHECK-DAG: addi 1, 1, {{[0-9]+}} ; CHECK-NEXT: blr ; ; ENABLE: .[[ELSE_LABEL]]: # %if.else @@ -289,7 +294,8 @@ ; Make sure we save the link register ; CHECK: mflr {{[0-9]+}} ; -; DISABLE: cmplwi 0, 3, 0 +; DISABLE-DAG: cmplwi 0, 3, 0 +; DISABLE-DAG: std 0, {{[0-9]+\([0-9]+\)}} ; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]] ; ; CHECK: bl somethingElse @@ -317,7 +323,8 @@ ; DISABLE: .[[EPILOG_BB]]: # %if.end ; ; Epilogue code.
-; CHECK: mtlr {{[0-9]+}} +; CHECK-DAG: mtlr {{[0-9]+}} +; CHECK-DAG: addi 1, 1, {{[0-9]+}} ; CHECK-NEXT: blr ; ; ENABLE: .[[ELSE_LABEL]]: # %if.else @@ -454,7 +461,8 @@ ; CHECK: slwi 3, 3, 3 ; DISABLE: b .[[EPILOGUE_BB:LBB[0-9_]+]] ; -; ENABLE: mtlr {{[0-9]+}} +; ENABLE-DAG: mtlr {{[0-9]+}} +; ENABLE-DAG: addi 1, 1, {{[0-9]+}} ; ENABLE-NEXT: blr ; ; CHECK: .[[ELSE_LABEL]]: # %if.else @@ -499,6 +507,7 @@ ; CHECK: li 3, 42 ; ; DISABLE: mtlr {{[0-9]+}} +; DISABLE: addi 1, 1, {{[0-9]+}} ; ; CHECK-NEXT: blr ; Index: test/CodeGen/PowerPC/ppc64-calls.ll =================================================================== --- test/CodeGen/PowerPC/ppc64-calls.ll +++ test/CodeGen/PowerPC/ppc64-calls.ll @@ -36,7 +36,7 @@ ; CHECK: ld [[FP:[0-9]+]], 0(3) ; CHECK: ld 11, 16(3) ; CHECK: ld 2, 8(3) -; CHECK-NEXT: mtctr [[FP]] +; CHECK: mtctr [[FP]] ; CHECK-NEXT: bctrl ; CHECK-NEXT: ld 2, 40(1) ret void @@ -51,7 +51,7 @@ ; CHECK: ld [[FP:[0-9]+]], 1024(0) ; CHECK: ld 11, 1040(0) ; CHECK: ld 2, 1032(0) -; CHECK-NEXT: mtctr [[FP]] +; CHECK: mtctr [[FP]] ; CHECK-NEXT: bctrl ; CHECK-NEXT: ld 2, 40(1) ret void Index: test/CodeGen/PowerPC/ppc64-nest.ll =================================================================== --- test/CodeGen/PowerPC/ppc64-nest.ll +++ test/CodeGen/PowerPC/ppc64-nest.ll @@ -17,7 +17,7 @@ define i8* @nest_caller(i8* %arg) nounwind { ; CHECK-LABEL: nest_caller: ; CHECK: mr 11, 3 -; CHECK-NEXT: bl nest_receiver +; CHECK: bl nest_receiver ; CHECK: blr %result = call i8* @nest_receiver(i8* nest %arg) Index: test/CodeGen/PowerPC/ppc64-stackmap-nops.ll =================================================================== --- test/CodeGen/PowerPC/ppc64-stackmap-nops.ll +++ test/CodeGen/PowerPC/ppc64-stackmap-nops.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=powerpc64-unknown-gnu-linux | FileCheck %s +; RUN: llc < %s -ppc-relax-prologepilog-inst-order=false -mtriple=powerpc64-unknown-gnu-linux | FileCheck %s define void @test_shadow_optimization() { entry: Index: 
test/CodeGen/PowerPC/retaddr2.ll =================================================================== --- test/CodeGen/PowerPC/retaddr2.ll +++ test/CodeGen/PowerPC/retaddr2.ll @@ -1,4 +1,5 @@ -; RUN: llc -mcpu=pwr7 < %s | FileCheck %s +; RUN: llc -mcpu=pwr7 -ppc-relax-prologepilog-inst-order=false < %s | FileCheck %s +; RUN: llc -mcpu=pwr7 < %s | FileCheck %s -check-prefix=CHECK-OPT-PE target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -17,6 +18,15 @@ ; CHECK: mtlr [[SR]] ; CHECK: blr +; CHECK-OPT-PE-LABEL: @test1 +; CHECK-OPT-PE: mflr {{[0-9]+}} +; CHECK-OPT-PE: std 0, 64(1) +; CHECK-OPT-PE-NOT: addi 1, 1 +; CHECK-OPT-PE-DAG: ld 3, 64(1) +; CHECK-OPT-PE-DAG: ld [[SR:[0-9]+]], 64(1) +; CHECK-OPT-PE: mtlr [[SR]] +; CHECK-OPT-PE: blr + ; Function Attrs: nounwind readnone declare i8* @llvm.returnaddress(i32) #0 Index: test/CodeGen/PowerPC/sjlj.ll =================================================================== --- test/CodeGen/PowerPC/sjlj.ll +++ test/CodeGen/PowerPC/sjlj.ll @@ -87,8 +87,8 @@ ; CHECK: blr ; CHECK: .LBB1_5: -; CHECK: mflr [[REGL:[0-9]+]] -; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31) # 8-byte Folded Reload +; CHECK-DAG: mflr [[REGL:[0-9]+]] +; CHECK-DAG: ld [[REG2:[0-9]+]], [[OFF]](31) # 8-byte Folded Reload ; CHECK: std [[REGL]], 8([[REG2]]) ; CHECK: li 3, 0