Index: lib/Target/PowerPC/P9InstrResources.td =================================================================== --- lib/Target/PowerPC/P9InstrResources.td +++ lib/Target/PowerPC/P9InstrResources.td @@ -12,11 +12,29 @@ // is listed here. Instructions in this file belong to itinerary classes that // have instructions with different resource requirements. // +// The makeup of the P9 CPU is modeled as follows: +// - Each CPU is made up of two superslices. +// - Each superslice is made up of two slices. Therefore, there are 4 slices +// for each CPU. +// - Up to 6 instructions can be dispatched to each CPU. Three per superslice. +// - Each CPU has: +// - One CY (Crypto) unit P9_CY_* +// - One DFU (Decimal Floating Point and Quad Precision) unit P9_DFU_* +// - Two PM (Permute) units. One on each superslice. P9_PM_* +// - Two DIV (Fixed Point Divide) units. One on each superslice. P9_DIV_* +// - Four ALU (Fixed Point Arithmetic) units. One on each slice. P9_ALU_* +// - Four DP (Floating Point) units. One on each slice. P9_DP_* +// This also includes fixed point multiply add. +// - Four AGEN (Address Generation) units. One for each slice. P9_AGEN_* +// - Four Load/Store Queues. P9_LS_* +// - Each set of instructions will require a number of these resources. //===----------------------------------------------------------------------===// - +// Two cycle ALU vector operation that uses an entire superslice. +// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. 
def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C], + DISP_1C, DISP_1C, DISP_1C], (instrs VADDCUW, VADDUBM, @@ -26,47 +44,41 @@ VAND, VANDC, VCMPEQUB, - VCMPEQUBo, VCMPEQUD, - VCMPEQUDo, VCMPEQUH, - VCMPEQUHo, VCMPEQUW, - VCMPEQUWo, - VCMPGTSB, - VCMPGTSBo, - VCMPGTSD, - VCMPGTSDo, - VCMPGTSH, - VCMPGTSHo, - VCMPGTSW, - VCMPGTSWo, - VCMPGTUB, - VCMPGTUBo, - VCMPGTUD, - VCMPGTUDo, - VCMPGTUH, - VCMPGTUHo, - VCMPGTUW, - VCMPGTUWo, VCMPNEB, - VCMPNEBo, VCMPNEH, - VCMPNEHo, VCMPNEW, - VCMPNEWo, VCMPNEZB, - VCMPNEZBo, VCMPNEZH, - VCMPNEZHo, VCMPNEZW, - VCMPNEZWo, VEQV, VEXTSB2D, VEXTSB2W, VEXTSH2D, VEXTSH2W, VEXTSW2D, + VRLB, + VRLD, + VRLDMI, + VRLDNM, + VRLH, + VRLW, + VRLWMI, + VRLWNM, + VSRAB, + VSRAD, + VSRAH, + VSRAW, + VSRB, + VSRD, + VSRH, + VSRW, + VSLB, + VSLD, + VSLH, + VSLW, VMRGEW, VMRGOW, VNAND, @@ -77,9 +89,7 @@ VORC, VPOPCNTB, VPOPCNTH, - VPOPCNTW, VSEL, - VSUBCUW, VSUBUBM, VSUBUDM, VSUBUHM, @@ -98,6 +108,8 @@ XVNEGDP, XVNEGSP, XVXEXPDP, + XVIEXPSP, + XVXEXPSP, XXLAND, XXLANDC, XXLEQV, @@ -107,28 +119,128 @@ XXLORf, XXLORC, XXLXOR, - XXSEL -)>; - -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs + XXSEL, XSABSQP, XSCPSGNQP, XSIEXPQP, XSNABSQP, XSNEGQP, - XSXEXPQP, - XSABSDP, - XSCPSGNDP, - XSIEXPDP, + XSXEXPQP +)>; + +// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a +// single slice. However, since it is Restricted it requires all 3 dispatches +// (DISP) for that superslice. +def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + FCMPUS, + FCMPUD, + XSTSTDCDP, + XSTSTDCSP +)>; + +// Standard Dispatch ALU operation for 3 cycles. Only one slice used. 
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], + (instrs + XSMAXCDP, + XSMAXDP, + XSMAXJDP, + XSMINCDP, + XSMINDP, + XSMINJDP, + XSTDIVDP, + XSTSQRTDP, + XSCMPEQDP, + XSCMPEXPDP, + XSCMPGEDP, + XSCMPGTDP, + XSCMPODP, + XSCMPUDP, + XSXSIGDP, + XSCVSPDPN +)>; + +// Standard Dispatch ALU operation for 2 cycles. Only one slice used. +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], + (instrs + ADDIStocHA, + ADDItocL, + MCRF, + MCRXRX, + SLD, + SRD, + SRAD, + SRADI, + RLDIC, XSNABSDP, + XSXEXPDP, + XSABSDP, XSNEGDP, - XSXEXPDP + XSCPSGNDP )>; -def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a +// single slice. However, since it is Restricted it requires all 3 dispatches +// (DISP) for that superslice. +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + RLDCL, + RLDCR, + RLDIMI, + RLDICL, + RLDICR, + RLDICL_32_64, + XSIEXPDP, + FMR, + FABSD, + FABSS, + FNABSD, + FNABSS, + FNEGD, + FNEGS, + FCPSGND, + FCPSGNS +)>; +// Three cycle ALU vector operation that uses an entire superslice. +// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. 
+def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], + (instrs + VBPERMD, + VABSDUB, + VABSDUH, + VABSDUW, + VADDUBS, + VADDUHS, + VADDUWS, + VAVGSB, + VAVGSH, + VAVGSW, + VAVGUB, + VAVGUH, + VAVGUW, + VCMPEQFP, + VCMPEQFPo, + VCMPGEFP, + VCMPGEFPo, + VCMPBFP, + VCMPBFPo, + VCMPGTFP, + VCMPGTFPo, + VCLZB, + VCLZD, + VCLZH, + VCLZW, + VCTZB, + VCTZD, + VCTZH, + VCTZW, + VADDSBS, + VADDSHS, + VADDSWS, + VMINFP, VMINSB, VMINSD, VMINSH, @@ -137,55 +249,54 @@ VMINUD, VMINUH, VMINUW, + VMAXFP, + VMAXSB, + VMAXSD, + VMAXSH, + VMAXSW, + VMAXUB, + VMAXUD, + VMAXUH, + VMAXUW, + VPOPCNTW, VPOPCNTD, VPRTYBD, VPRTYBW, - VRLB, - VRLD, - VRLDMI, - VRLDNM, - VRLH, - VRLW, - VRLWMI, - VRLWNM, VSHASIGMAD, VSHASIGMAW, - VSLB, - VSLD, - VSLH, - VSLW, - VSRAB, - VSRAD, - VSRAH, - VSRAW, - VSRB, - VSRD, - VSRH, - VSRW, VSUBSBS, VSUBSHS, VSUBSWS, VSUBUBS, VSUBUHS, VSUBUWS, - XSCMPEQDP, - XSCMPEXPDP, - XSCMPGEDP, - XSCMPGTDP, - XSCMPODP, - XSCMPUDP, - XSCVSPDPN, - XSMAXCDP, - XSMAXDP, - XSMAXJDP, - XSMINCDP, - XSMINDP, - XSMINJDP, - XSTDIVDP, - XSTSQRTDP, - XSTSTDCDP, - XSTSTDCSP, - XSXSIGDP, + VSUBCUW, + VCMPGTSB, + VCMPGTSBo, + VCMPGTSD, + VCMPGTSDo, + VCMPGTSH, + VCMPGTSHo, + VCMPGTSW, + VCMPGTSWo, + VCMPGTUB, + VCMPGTUBo, + VCMPGTUD, + VCMPGTUDo, + VCMPGTUH, + VCMPGTUHo, + VCMPGTUW, + VCMPGTUWo, + VCMPNEBo, + VCMPNEHo, + VCMPNEWo, + VCMPNEZBo, + VCMPNEZHo, + VCMPNEZWo, + VCMPEQUBo, + VCMPEQUDo, + VCMPEQUHo, + VCMPEQUWo, XVCMPEQDP, XVCMPEQDPo, XVCMPEQSP, @@ -198,7 +309,6 @@ XVCMPGTDPo, XVCMPGTSP, XVCMPGTSPo, - XVIEXPSP, XVMAXDP, XVMAXSP, XVMINDP, @@ -209,58 +319,15 @@ XVTSQRTSP, XVTSTDCDP, XVTSTDCSP, - XVXEXPSP, XVXSIGDP, XVXSIGSP )>; -def : InstRW<[P9_ALUE_4C, P9_ALUO_4C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], - (instrs - VABSDUB, - VABSDUH, - VABSDUW, - VADDSBS, - VADDSHS, - VADDSWS, - VADDUBS, - VADDUHS, - VADDUWS, - VAVGSB, - VAVGSH, - VAVGSW, - VAVGUB, - VAVGUH, - VAVGUW, - VBPERMD, - VCLZB, - VCLZD, - VCLZH, - VCLZW, - VCMPBFP, - 
VCMPBFPo, - VCMPGTFP, - VCMPGTFPo, - VCTZB, - VCTZD, - VCTZH, - VCTZW, - VMAXFP, - VMAXSB, - VMAXSD, - VMAXSH, - VMAXSW, - VMAXUB, - VMAXUD, - VMAXUH, - VMAXUW, - VMINFP, - VCMPEQFP, - VCMPEQFPo, - VCMPGEFP, - VCMPGEFPo -)>; - -def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 7 cycle DP vector operation that uses an entire superslice. +// Uses both DP units (the even DPE and odd DPO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. +def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs VADDFP, VCTSXS, @@ -367,8 +434,47 @@ VSUMSWS )>; +// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three +// dispatch units for the superslice. def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + FRSP, + FRIND, + FRINS, + FRIPD, + FRIPS, + FRIZD, + FRIZS, + FRIMD, + FRIMS, + FRE, + FRES, + FRSQRTE, + FRSQRTES, + FMADDS, + FMADD, + FMSUBS, + FMSUB, + FNMADDS, + FNMADD, + FNMSUBS, + FNMSUB, + FSELD, + FSELS, + FADDS, + FMULS, + FMUL, + FSUBS, + FCFID, + FCTID, + FCTIDZ, + FCFIDU, + FCFIDS, + FCFIDUS, + FCTIDUZ, + FCTIWUZ, + FCTIW, + FCTIWZ, XSMADDADP, XSMADDASP, XSMADDMDP, @@ -389,7 +495,7 @@ XSNMSUBMSP )>; - +// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units. def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs XSADDDP, @@ -397,8 +503,10 @@ XSCVDPHP, XSCVDPSP, XSCVDPSXDS, + XSCVDPSXDSs, XSCVDPSXWS, XSCVDPUXDS, + XSCVDPUXDSs, XSCVDPUXWS, XSCVHPDP, XSCVSPDP, @@ -421,7 +529,10 @@ XSCVDPSPN )>; -def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], +// Three Cycle PM operation. Only one PM unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. 
+def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], (instrs VBPERMQ, VCLZLSBB, @@ -469,7 +580,9 @@ VSLO, VSLV, VSPLTB, + VSPLTBs, VSPLTH, + VSPLTHs, VSPLTISB, VSPLTISH, VSPLTISW, @@ -498,6 +611,9 @@ XXSLDWI, XXSPLTIB, XXSPLTW, + XXSPLTWs, + XXPERMDI, + XXPERMDIs, VADDCUQ, VADDECUQ, VADDEUQM, @@ -517,7 +633,10 @@ XSXSIGQP )>; -def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSADDQP, XSADDQPO, @@ -536,7 +655,10 @@ XSSUBQPO )>; -def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSMADDQP, XSMADDQPO, @@ -550,45 +672,56 @@ XSNMSUBQPO )>; -def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSDIVQP, XSDIVQPO )>; -def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSSQRTQP, XSSQRTQPO )>; -// Load Operation in IIC_LdStLFD - +// 5 Cycle load uses a single slice. 
def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C], (instrs LXSDX, LXVD2X, LXSIWZX, LXV, - LXSD + LXVX, + LXSD, + DFLOADf64 )>; -def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], +// 4 Cycle load uses a single slice. +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C], (instrs - LFIWZX, - LFDX, - LFD + COPY )>; -def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// 4 Cycle Restricted load uses a single slice but the dispatch for the whole +// superslice. +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - LXSSPX, - LXSIWAX, - LXSSP + LFIWZX, + LFDX, + LFD )>; -def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, +// Cracked Restricted Load instruction. +// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU +// operations cannot be done at the same time and so their latencies are added. +// Full 6 dispatches are required as this is both cracked and restricted. +def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LFIWAX, @@ -596,14 +729,35 @@ LFS )>; -def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C], +// Cracked Load instruction. +// Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU +// operations cannot be done at the same time and so their latencies are added. +// Full 4 dispatches are required as this is a cracked instruction. +def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + LXSSPX, + LXSIWAX, + LXSSP, + DFLOADf32 +)>; + +// Cracked Load that requires the PM resource. +// Since the Load and the PM cannot be done at the same time the latencies are +// added. Requires 8 cycles. +// Since the PM requires the full superslice we need both EXECE, EXECO pipelines +// as well as 3 dispatches for the PM. The Load requires the remaining 2 +// dispatches. 
+def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LXVDSX, + LXVWSX, LXVW4X )>; -// Store Operations in IIC_LdStSTFD. - +// Single slice Restricted store operation. The restricted operation requires +// all three dispatches for the superslice. def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], (instrs STFS, @@ -613,74 +767,83 @@ STFDX, STXSDX, STXSSPX, - STXSIWX + STXSIWX, + DFSTOREf32, + DFSTOREf64 )>; -def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C], +// Store operation that requires the whole superslice. +def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs STXVD2X, STXVW4X )>; -// Divide Operations in IIC_IntDivW, IIC_IntDivD. - -def : InstRW<[P9_DIV_16C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVW, - DIVWU + DIVWU, + MODSW )>; -def : InstRW<[P9_DIV_24C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVWE, DIVD, DIVWEU, - DIVDU + DIVDU, + MODSD, + MODUD, + MODUW )>; -def : InstRW<[P9_DIV_40C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. 
+def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVDE, DIVDEU )>; -def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// Cracked DIV and ALU operation. Requires one full slice for the ALU operation +// and one full superslice for the DIV operation since there is only one DIV +// per superslice. Latency of DIV plus ALU is 26. +def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs DIVWEo, DIVWEUo )>; -def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// Cracked DIV and ALU operation. Requires one full slice for the ALU operation +// and one full superslice for the DIV operation since there is only one DIV +// per superslice. Latency of DIV plus ALU is 42. +def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs DIVDEo, DIVDEUo )>; -// Rotate Operations in IIC_IntRotateD, IIC_IntRotateDI -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs - SLD, - SRD, - SRAD, - SRADI, - RLDIC -)>; - -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], - (instrs - RLDCL, - RLDCR, - RLDIMI, - RLDICL, - RLDICR, - RLDICL_32_64 -)>; - // CR access instructions in _BrMCR, IIC_BrMCRX. +// Cracked, restricted, ALU operations. +// Here the two ALU ops can actually be done in parallel and therefore the +// latencies are not added together. Otherwise this is like having two +// instructions running together on two pipelines and 6 dispatches. +// ALU ops are 2 cycles each. 
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs @@ -690,13 +853,12 @@ MTCRF8 )>; -def : InstRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs - MCRF, - MCRXRX -)>; - -def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C, +// Cracked, restricted, ALU operations. +// Here the two ALU ops can actually be done in parallel and therefore the +// latencies are not added together. Otherwise this is like having two +// instructions running together on two pipelines and 6 dispatches. +// ALU ops are 3 cycles each. +def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs MCRFS @@ -704,93 +866,57 @@ // FP Div instructions in IIC_FPDivD and IIC_FPDivS. +// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - FDIV, - XSDIVDP + FDIV )>; -def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Instruction. Takes one slice and 2 dispatches. +def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - FDIVS, - XSDIVSP -)>; - -def : InstRW<[P9_DP_24C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], - (instrs - XVDIVSP + XSDIVDP )>; -def : InstRW<[P9_DP_33C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - XVDIVDP + FDIVS )>; -// FP Instructions in IIC_FPGeneral, IIC_FPFused - -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 22 Cycle DP Instruction. Takes one slice and 2 dispatches. 
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - FRSP, - FRIND, - FRINS, - FRIPD, - FRIPS, - FRIZD, - FRIZS, - FRIMD, - FRIMS, - FRE, - FRES, - FRSQRTE, - FRSQRTES, - FMADDS, - FMADD, - FMSUBS, - FMSUB, - FNMADDS, - FNMADD, - FNMSUBS, - FNMSUB, - FSELD, - FSELS, - FADDS, - FMULS, - FMUL, - FSUBS, - FCFID, - FCTID, - FCTIDZ, - FCFIDU, - FCFIDS, - FCFIDUS, - FCTIDUZ, - FCTIWUZ, - FCTIW, - FCTIWZ + XSDIVSP )>; -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 24 Cycle DP Vector Instruction. Takes one full superslice. +// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given +// superslice. +def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs - FMR, - FABSD, - FABSS, - FNABSD, - FNABSS, - FNEGD, - FNEGS, - FCPSGND, - FCPSGNS + XVDIVSP )>; -def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Vector Instruction. Takes one full superslice. +// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given +// superslice. +def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs - FCMPUS, - FCMPUD + XVDIVDP )>; // Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX. -def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C, +// Instruction cracked into three pieces. One Load and two ALU operations. +// The Load and one of the ALU ops cannot be run at the same time and so the +// latencies are added together for 6 cycles. The remaining ALU is 2 cycles. +// Both the load and the ALU that depends on it are restricted and so they take +// a total of 6 dispatches. The final 2 dispatches come from the second ALU op. +// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load. 
+def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], @@ -799,10 +925,32 @@ LFSUX )>; -def : InstRW<[P9_LS_5C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, +// Cracked instruction made up of a Load and an ALU. The ALU does not depend on +// the load and so it can be run at the same time as the load. The load is also +// restricted. 3 dispatches are from the restricted load while the other two +// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline +// is required for the ALU. +def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LFDU, LFDUX )>; +// Crypto Instructions + +// 6 Cycle CY operation. Only one CY unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + VPMSUMB, + VPMSUMD, + VPMSUMH, + VPMSUMW, + VCIPHER, + VCIPHERLAST, + VNCIPHER, + VNCIPHERLAST, + VSBOX +)>; Index: lib/Target/PowerPC/PPCInstrFormats.td =================================================================== --- lib/Target/PowerPC/PPCInstrFormats.td +++ lib/Target/PowerPC/PPCInstrFormats.td @@ -2057,4 +2057,5 @@ let PPC64 = 0; let Pattern = pattern; let Inst{31-0} = 0; + let hasNoSchedulingInfo = 1; } Index: lib/Target/PowerPC/PPCInstrInfo.td =================================================================== --- lib/Target/PowerPC/PPCInstrInfo.td +++ lib/Target/PowerPC/PPCInstrInfo.td @@ -3951,6 +3951,7 @@ let AsmString = asm; let isAsmParserOnly = 1; let isPseudo = 1; + let hasNoSchedulingInfo = 1; } def : InstAlias<"sc", (SC 0)>; Index: lib/Target/PowerPC/PPCScheduleP9.td =================================================================== --- lib/Target/PowerPC/PPCScheduleP9.td +++ lib/Target/PowerPC/PPCScheduleP9.td @@ -22,7 +22,9 @@ // Try to make sure we have 
at least 10 dispatch groups in a loop. let LoopMicroOpBufferSize = 60; - let CompleteModel = 0; + let CompleteModel = 1; + + let UnsupportedFeatures = [HasQPX]; } @@ -68,6 +70,10 @@ def LS : ProcResource<4>; def PM : ProcResource<2>; def DFU : ProcResource<1>; + def BR : ProcResource<1> { + let BufferSize = 16; + } + def CY : ProcResource<1>; def TestGroup : ProcResGroup<[ALU, DP]>; @@ -145,6 +151,10 @@ let Latency = 6; } + def P9_DIV_12C : SchedWriteRes<[DIV]> { + let Latency = 12; + } + def P9_DIV_16C_8 : SchedWriteRes<[DIV]> { let ResourceCycles = [8]; let Latency = 16; @@ -190,6 +200,16 @@ let Latency = 24; } + def P9_DPO_24C_8 : SchedWriteRes<[DPO]> { + let ResourceCycles = [8]; + let Latency = 24; + } + + def P9_DPE_24C_8 : SchedWriteRes<[DPE]> { + let ResourceCycles = [8]; + let Latency = 24; + } + def P9_DP_26C_5 : SchedWriteRes<[DP]> { let ResourceCycles = [5]; let Latency = 22; @@ -205,6 +225,16 @@ let Latency = 33; } + def P9_DPE_33C_8 : SchedWriteRes<[DPE]> { + let ResourceCycles = [8]; + let Latency = 33; + } + + def P9_DPO_33C_8 : SchedWriteRes<[DPO]> { + let ResourceCycles = [8]; + let Latency = 33; + } + def P9_DP_36C_10 : SchedWriteRes<[DP]> { let ResourceCycles = [10]; let Latency = 36; @@ -248,11 +278,25 @@ let Latency = 76; let ResourceCycles = [62]; } + + def P9_BR_2C : SchedWriteRes<[BR]> { + let Latency = 2; + } + + def P9_BR_5C : SchedWriteRes<[BR]> { + let Latency = 5; + } + + def P9_CY_6C : SchedWriteRes<[CY]> { + let Latency = 6; + } + // ***************** WriteSeq Definitions ***************** def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>; def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>; def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>; + def P9_LoadAndLoadOp_8C : WriteSequence<[P9_LS_4C, P9_LS_4C]>; def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>; def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>; def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, 
P9_ALU_3C]>; @@ -260,19 +304,32 @@ // ***************** Defining Itinerary Class Resources ***************** + // The following itineraries are fully covered by the InstRW definitions in + // P9InstrResources.td so aren't listed here. + // IIC_FPDivD, IIC_FPDivS, IIC_FPFused, IIC_IntDivD, IIC_LdStLFDU, + // IIC_LdStLFDUX + def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - [IIC_IntSimple, IIC_IntGeneral]>; + [IIC_IntSimple, IIC_IntGeneral, IIC_IntRFID, + IIC_IntRotateD, IIC_IntRotateDI, IIC_IntTrapD, + IIC_SprRFI]>; + + def : ItinRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_IntTrapW]>; def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>; def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntCompare]>; + def : ItinRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C], [IIC_VecGeneral, IIC_FPCompare]>; + def : ItinRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], - [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI]>; + [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI, IIC_IntMulHD]>; def : ItinRW<[P9_LS_5C, IP_EXEC_1C, DISP_1C, DISP_1C], - [IIC_LdStLoad, IIC_LdStLD]>; + [IIC_LdStLoad, IIC_LdStLD, IIC_LdStLFD]>; def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], @@ -300,12 +357,18 @@ def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_LdStLWARX, IIC_LdStLDARX, IIC_LdStLMW]>; + def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_LdStCOPY, IIC_SprABORT, IIC_LdStPASTE, IIC_LdStDCBF, + IIC_LdStICBI, IIC_LdStSync, IIC_SprISYNC, IIC_SprMSGSYNC, + IIC_SprSLBIA, IIC_SprSLBSYNC, IIC_SprTLBSYNC]>; + def : ItinRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_LdStSTFD, IIC_LdStSTD, IIC_LdStStore]>; def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], - [IIC_LdStSTDU, IIC_LdStSTDUX]>; + [IIC_LdStSTDU, IIC_LdStSTDUX, IIC_LdStStoreUpd, 
IIC_SprSLBIEG, + IIC_SprTLBIA, IIC_SprTLBIE]>; def : ItinRW<[P9_StoreAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], @@ -315,20 +378,44 @@ [IIC_BrCR, IIC_IntMTFSB0]>; def : ItinRW<[P9_ALUOpAndALUOp_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, - IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C, DISP_1C, DISP_1C], [IIC_SprMFCR, IIC_SprMFCRF]>; + IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C], + [IIC_SprMFCR, IIC_SprMFCRF, IIC_BrMCR, IIC_BrMCRX, IIC_IntMFFS]>; + + def : ItinRW<[P9_BR_2C, DISP_1C], [IIC_BrB]>; + def : ItinRW<[P9_BR_5C, DISP_1C], [IIC_SprMFSPR]>; // This class should be broken down to instruction level, once some missing // info is obtained. def : ItinRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_SprMTSPR]>; - def : ItinRW<[P9_DP_7C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C], [IIC_FPGeneral, IIC_FPAddSub]>; + def : ItinRW<[P9_LoadAndLoadOp_8C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_SprSLBIE, IIC_SprSLBMFEE, IIC_SprSLBMFEV, IIC_SprSLBMTE, + IIC_SprTLBIEL]>; + + // IIC_VecFP is added here although many instructions with that itinerary + // use very different resources. It would appear that instructions were + // given that itinerary rather carelessly over time. Specific instructions + // that use different resources are listed in various InstRW classes. 
+ def : ItinRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], + [IIC_FPGeneral, IIC_FPAddSub, IIC_VecFP]>; + + def : ItinRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C], [IIC_VecFPCompare]>; + + def : ItinRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], + [IIC_VecPerm]>; def : ItinRW<[P9_DP_36C_10, IP_EXEC_1C], [IIC_FPSqrtD]>; def : ItinRW<[P9_DP_26C_5, P9_DP_26C_5, IP_EXEC_1C, IP_EXEC_1C], [IIC_FPSqrtS]>; + def : ItinRW<[P9_DIV_12C, IP_EXECE_1C, DISP_1C, DISP_1C], + [IIC_SprMFMSR, IIC_SprMFPMR, IIC_SprMFSR, IIC_SprMFTB, + IIC_SprMTMSR, IIC_SprMTMSRD, IIC_SprMTPMR, IIC_SprMTSR]>; + + def : ItinRW<[], [IIC_SprSTOP]>; + include "P9InstrResources.td" }